Script to reproduce years based on a model trained with random points¶

Importing¶

In [ ]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import root_mean_squared_error as rmse

from tqdm import tqdm

import dill
import random

import salishsea_tools.viz_tools as sa_vi

Datasets Preparation¶

In [ ]:
def datasets_preparation(dataset, dataset2):
    
    drivers = np.stack([np.ravel(dataset['Temperature_(0m-15m)']),
        np.ravel(dataset['Temperature_(15m-100m)']), 
        np.ravel(dataset['Salinity_(0m-15m)']),
        np.ravel(dataset['Salinity_(15m-100m)']),
        np.ravel(dataset2['Summation_of_solar_radiation']),
        np.ravel(dataset2['Mean_wind_speed']),
        np.ravel(dataset2['Mean_air_temperature'])
        ])
    indx = np.where(~np.isnan(drivers).any(axis=0))
    drivers = drivers[:,indx[0]]

    diat = np.ravel(dataset['Diatom'])
    diat = diat[indx[0]]

    return(drivers, diat, indx)

Regressor¶

In [ ]:
def regressor (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    X_train, _, y_train, _ = train_test_split(inputs, targets, train_size=0.35)

    model = ExtraTreesRegressor()
    model = make_pipeline(StandardScaler(), model)
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(X_train, y_train)

    return (regr)

Regressor 2¶

In [ ]:
def regressor2 (inputs, targets, variable_name):

    inputs2 = inputs.transpose()

    outputs_test = regr.predict(inputs2)
   
    m = scatter_plot(targets, outputs_test, variable_name) 
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 3¶

In [ ]:
def regressor3 (inputs, targets):
  
    inputs2 = inputs.transpose()

    outputs_test = regr.predict(inputs2)
   
    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs_test, deg=1)
    
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 4¶

In [ ]:
def regressor4 (inputs, targets, variable_name):

    inputs2 = inputs.transpose()
    
    outputs = regr.predict(inputs2)

    # Post processing
    indx2 = np.full((len(diat_i.y)*len(diat_i.x)),np.nan)
    indx2[indx[0]] = outputs
    model = np.reshape(indx2,(len(diat_i.y),len(diat_i.x)))

    m = scatter_plot(targets, outputs, variable_name + str(dates[i].date())) 

    # Preparation of the dataarray 
    model = xr.DataArray(model,
        coords = {'y': diat_i.y, 'x': diat_i.x},
        dims = ['y','x'],
        attrs=dict( long_name = variable_name + "Concentration",
        units="mmol m-2"),)
                        
    plotting3(targets, model, diat_i, variable_name)

Printing¶

In [ ]:
def printing (targets, outputs, m):

    print ('The amount of data points is', outputs.size)
    print ('The slope of the best fitting line is ', np.round(m,3))
    print ('The correlation coefficient is:', np.round(np.corrcoef(targets, outputs)[0][1],3))
    print (' The mean square error is:', rmse(targets,outputs))

Scatter Plot¶

In [ ]:
def scatter_plot(targets, outputs, variable_name):

    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs, deg=1)

    printing(targets, outputs, m)

    fig, ax = plt.subplots(2, figsize=(5,10), layout='constrained')

    ax[0].scatter(targets,outputs, alpha = 0.2, s = 10)

    lims = [np.min([ax[0].get_xlim(), ax[0].get_ylim()]),
        np.max([ax[0].get_xlim(), ax[0].get_ylim()])]

    # plot fitted y = m*x + b
    ax[0].axline(xy1=(0, b), slope=m, color='r')

    ax[0].set_xlabel('targets')
    ax[0].set_ylabel('outputs')
    ax[0].set_xlim(lims)
    ax[0].set_ylim(lims)
    ax[0].set_aspect('equal')

    ax[0].plot(lims, lims,linestyle = '--',color = 'k')

    h = ax[1].hist2d(targets,outputs, bins=100, cmap='jet', 
        range=[lims,lims], cmin=0.1, norm='log')
    
    ax[1].plot(lims, lims,linestyle = '--',color = 'k')

    # plot fitted y = m*x + b
    ax[1].axline(xy1=(0, b), slope=m, color='r')

    ax[1].set_xlabel('targets')
    ax[1].set_ylabel('outputs')
    ax[1].set_aspect('equal')

    fig.colorbar(h[3],ax=ax[1], location='bottom')

    fig.suptitle(variable_name)

    plt.show()

    return (m)

Plotting¶

In [ ]:
def plotting(variable, name):

    plt.plot(years,variable, marker = '.', linestyle = '')
    plt.xlabel('Years')
    plt.ylabel(name)
    plt.show()

Plotting 2¶

In [ ]:
def plotting2(variable,title):
    
    fig, ax = plt.subplots()

    scatter= ax.scatter(dates,variable, marker='.', c=pd.DatetimeIndex(dates).month)

    ax.legend(handles=scatter.legend_elements()[0], labels=['February','March','April'])
    fig.suptitle('Daily ' + title + ' (15 Feb - 30 Apr)')
    
    fig.show()

Plotting 3¶

In [ ]:
def plotting3(targets, model, variable, variable_name):

    fig, ax = plt.subplots(2,2, figsize = (10,15))

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    variable.plot(ax=ax[0,0], cmap=cmap, vmin = targets.min(), vmax =targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    model.plot(ax=ax[0,1], cmap=cmap, vmin = targets.min(), vmax = targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    ((variable-model) / variable * 100).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs={'label': variable_name + ' Concentration  [percentage]'})

    plt.subplots_adjust(left=0.1,
        bottom=0.1, 
        right=0.95, 
        top=0.95, 
        wspace=0.35, 
        hspace=0.35)

    sa_vi.set_aspect(ax[0,0])
    sa_vi.set_aspect(ax[0,1])
    sa_vi.set_aspect(ax[1,0])


    ax[0,0].title.set_text(variable_name + ' (targets)')
    ax[0,1].title.set_text(variable_name + ' (outputs)')
    ax[1,0].title.set_text('targets - outputs')
    ax[1,1].axis('off')

    fig.suptitle(str(dates[i].date()))

    plt.show()
    

Training (Random Points)¶

In [ ]:
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_model_var_old.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')

ds = ds.isel(time_counter = (np.arange(0, len(ds.time_counter),2)), 
    y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

ds2 = ds2.isel(time_counter = (np.arange(0, len(ds2.time_counter),2)), 
    y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
    x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

dates = pd.DatetimeIndex(ds['time_counter'].values)

drivers, diat, _ = datasets_preparation(ds, ds2)

regr = regressor(drivers, diat)

Other Years (Anually)¶

In [ ]:
years = range (2007,2024)

r_all = []
rms_all = []
slope_all = []

for year in tqdm(range (2007,2024)):
    
    dataset = ds.sel(time_counter=str(year))
    dataset2 = ds2.sel(time_counter=str(year))
    
    drivers, diat, _ = datasets_preparation(dataset, dataset2)

    r, rms, m = regressor2(drivers, diat, 'Diatom ' + str(year))
    
    r_all.append(r)
    rms_all.append(rms)
    slope_all.append(m)
    
plotting(np.transpose(r_all), 'Correlation Coefficient')
plotting(np.transpose(rms_all), 'Root Mean Square Error')
plotting (np.transpose(slope_all), 'Slope of the best fitting line')
  0%|          | 0/17 [00:00<?, ?it/s]
The amount of data points is 70794
The slope of the best fitting line is  0.79
The correlation coefficient is: 0.943
 The mean square error is: 0.055718561999310606
No description has been provided for this image
  6%|▌         | 1/17 [01:07<18:03, 67.71s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.778
The correlation coefficient is: 0.926
 The mean square error is: 0.05640590600520877
No description has been provided for this image
 12%|█▏        | 2/17 [02:08<15:56, 63.74s/it]
The amount of data points is 68931
The slope of the best fitting line is  0.79
The correlation coefficient is: 0.943
 The mean square error is: 0.06900807497777764
No description has been provided for this image
 18%|█▊        | 3/17 [03:08<14:26, 61.93s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.764
The correlation coefficient is: 0.937
 The mean square error is: 0.05398620175081627
No description has been provided for this image
 24%|██▎       | 4/17 [04:08<13:15, 61.19s/it]
The amount of data points is 68931
The slope of the best fitting line is  0.838
The correlation coefficient is: 0.952
 The mean square error is: 0.049096458547891166
No description has been provided for this image
 29%|██▉       | 5/17 [05:11<12:21, 61.81s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.807
The correlation coefficient is: 0.946
 The mean square error is: 0.053694577698207104
No description has been provided for this image
 35%|███▌      | 6/17 [06:10<11:10, 60.97s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.778
The correlation coefficient is: 0.944
 The mean square error is: 0.06381927261420603
No description has been provided for this image
 41%|████      | 7/17 [07:09<10:02, 60.26s/it]
The amount of data points is 68931
The slope of the best fitting line is  0.775
The correlation coefficient is: 0.922
 The mean square error is: 0.05772809530113693
No description has been provided for this image
 47%|████▋     | 8/17 [08:08<08:59, 59.97s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.774
The correlation coefficient is: 0.943
 The mean square error is: 0.05384693281337726
No description has been provided for this image
 53%|█████▎    | 9/17 [09:08<07:58, 59.81s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.828
The correlation coefficient is: 0.955
 The mean square error is: 0.052335081889689966
No description has been provided for this image
 59%|█████▉    | 10/17 [10:15<07:15, 62.15s/it]
The amount of data points is 68931
The slope of the best fitting line is  0.799
The correlation coefficient is: 0.935
 The mean square error is: 0.04892873500306341
No description has been provided for this image
 65%|██████▍   | 11/17 [11:15<06:08, 61.41s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.747
The correlation coefficient is: 0.917
 The mean square error is: 0.06495627808226698
No description has been provided for this image
 71%|███████   | 12/17 [12:14<05:03, 60.65s/it]
The amount of data points is 68931
The slope of the best fitting line is  0.779
The correlation coefficient is: 0.925
 The mean square error is: 0.06714903125882138
No description has been provided for this image
 76%|███████▋  | 13/17 [13:14<04:01, 60.49s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.749
The correlation coefficient is: 0.931
 The mean square error is: 0.07902781067276396
No description has been provided for this image
 82%|████████▏ | 14/17 [14:13<03:00, 60.18s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.831
The correlation coefficient is: 0.948
 The mean square error is: 0.05634470354228901
No description has been provided for this image
 88%|████████▊ | 15/17 [15:14<02:00, 60.19s/it]
The amount of data points is 68931
The slope of the best fitting line is  0.79
The correlation coefficient is: 0.934
 The mean square error is: 0.05393667584524409
No description has been provided for this image
 94%|█████████▍| 16/17 [16:13<00:59, 59.99s/it]
The amount of data points is 70794
The slope of the best fitting line is  0.762
The correlation coefficient is: 0.927
 The mean square error is: 0.06480253521058248
No description has been provided for this image
100%|██████████| 17/17 [17:13<00:00, 60.80s/it]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Other Years (Daily)¶

In [ ]:
r_all2 = np.array([])
rms_all2 = np.array([])
slope_all2 = np.array([])

for i in tqdm(range (0, len(ds.time_counter))):
    
    dataset = ds.isel(time_counter=i)
    dataset2 = ds2.isel(time_counter=i)

    drivers, diat, _ = datasets_preparation(dataset, dataset2)

    r, rms, m = regressor3(drivers, diat)

    r_all2 = np.append(r_all2,r)
    rms_all2 = np.append(rms_all2,rms)
    slope_all2 = np.append(slope_all2,m)

plotting2(r_all2, 'Correlation Coefficients')
plotting2(rms_all2, 'Root Mean Square Errors')
plotting2(slope_all2, 'Slope of the best fitting line')
  2%|▏         | 15/640 [15:13<10:34:15, 60.89s/it]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[14], line 12
      8 dataset2 = ds2.isel(time_counter=i)
     10 drivers, diat, _ = datasets_preparation(dataset, dataset2)
---> 12 r, rms, m = regressor3(drivers, diat)
     14 r_all2 = np.append(r_all2,r)
     15 rms_all2 = np.append(rms_all2,rms)

Cell In[5], line 5, in regressor3(inputs, targets)
      1 def regressor3 (inputs, targets):
      3     inputs2 = inputs.transpose()
----> 5     outputs_test = regr.predict(inputs2)
      7     # compute slope m and intercept b
      8     m, b = np.polyfit(targets, outputs_test, deg=1)

File ~/conda_envs/analysis-ilias/lib/python3.11/site-packages/sklearn/ensemble/_bagging.py:1191, in BaggingRegressor.predict(self, X)
   1188 # Parallel loop
   1189 n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)
-> 1191 all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
   1192     delayed(_parallel_predict_regression)(
   1193         self.estimators_[starts[i] : starts[i + 1]],
   1194         self.estimators_features_[starts[i] : starts[i + 1]],
   1195         X,
   1196     )
   1197     for i in range(n_jobs)
   1198 )
   1200 # Reduce
   1201 y_hat = sum(all_y_hat) / self.n_estimators

File ~/conda_envs/analysis-ilias/lib/python3.11/site-packages/sklearn/utils/parallel.py:67, in Parallel.__call__(self, iterable)
     62 config = get_config()
     63 iterable_with_config = (
     64     (_with_config(delayed_func, config), args, kwargs)
     65     for delayed_func, args, kwargs in iterable
     66 )
---> 67 return super().__call__(iterable_with_config)

File ~/conda_envs/analysis-ilias/lib/python3.11/site-packages/joblib/parallel.py:1952, in Parallel.__call__(self, iterable)
   1946 # The first item from the output is blank, but it makes the interpreter
   1947 # progress until it enters the Try/Except block of the generator and
   1948 # reach the first `yield` statement. This starts the aynchronous
   1949 # dispatch of the tasks to the workers.
   1950 next(output)
-> 1952 return output if self.return_generator else list(output)

File ~/conda_envs/analysis-ilias/lib/python3.11/site-packages/joblib/parallel.py:1595, in Parallel._get_outputs(self, iterator, pre_dispatch)
   1592     yield
   1594     with self._backend.retrieval_context():
-> 1595         yield from self._retrieve()
   1597 except GeneratorExit:
   1598     # The generator has been garbage collected before being fully
   1599     # consumed. This aborts the remaining tasks if possible and warn
   1600     # the user if necessary.
   1601     self._exception = True

File ~/conda_envs/analysis-ilias/lib/python3.11/site-packages/joblib/parallel.py:1707, in Parallel._retrieve(self)
   1702 # If the next job is not ready for retrieval yet, we just wait for
   1703 # async callbacks to progress.
   1704 if ((len(self._jobs) == 0) or
   1705     (self._jobs[0].get_status(
   1706         timeout=self.timeout) == TASK_PENDING)):
-> 1707     time.sleep(0.01)
   1708     continue
   1710 # We need to be careful: the job list can be filling up as
   1711 # we empty it and Python list are not thread-safe by
   1712 # default hence the use of the lock

KeyboardInterrupt: 

Daily Maps¶

In [ ]:
maps = random.sample(range(0,len(ds.time_counter)),10)

for i in tqdm(maps):

    dataset = ds.isel(time_counter=i)
    dataset2 = ds2.isel(time_counter=i)
    drivers, diat, indx = datasets_preparation(dataset, dataset2)

    diat_i = dataset['Diatom']

    regressor4(drivers, diat, 'Diatom ')
  0%|          | 0/10 [00:00<?, ?it/s]
The amount of data points is 1863
The slope of the best fitting line is  0.721
The correlation coefficient is: 0.917
 The mean square error is: 0.05682262296005913
No description has been provided for this image
No description has been provided for this image
 10%|█         | 1/10 [01:02<09:26, 62.94s/it]
The amount of data points is 1863
The slope of the best fitting line is  0.791
The correlation coefficient is: 0.926
 The mean square error is: 0.02917307477306278
No description has been provided for this image
No description has been provided for this image
 20%|██        | 2/10 [02:11<08:47, 65.99s/it]
The amount of data points is 1863
The slope of the best fitting line is  0.853
The correlation coefficient is: 0.894
 The mean square error is: 0.023679885789193063
No description has been provided for this image
No description has been provided for this image
 30%|███       | 3/10 [03:30<08:24, 72.08s/it]
The amount of data points is 1863
The slope of the best fitting line is  0.738
The correlation coefficient is: 0.79
 The mean square error is: 0.03089873869329355
No description has been provided for this image
No description has been provided for this image
 40%|████      | 4/10 [04:44<07:18, 73.00s/it]
The amount of data points is 1863
The slope of the best fitting line is  0.581
The correlation coefficient is: 0.64
 The mean square error is: 0.06719042531980075
No description has been provided for this image
No description has been provided for this image
 50%|█████     | 5/10 [05:54<05:58, 71.72s/it]
The amount of data points is 1863
The slope of the best fitting line is  0.599
The correlation coefficient is: 0.806
 The mean square error is: 0.08878353692556587
No description has been provided for this image
No description has been provided for this image
 60%|██████    | 6/10 [07:07<04:49, 72.36s/it]
The amount of data points is 1863
The slope of the best fitting line is  0.728
The correlation coefficient is: 0.855
 The mean square error is: 0.03185442339583298
No description has been provided for this image
No description has been provided for this image
 70%|███████   | 7/10 [08:17<03:34, 71.50s/it]
The amount of data points is 1863
The slope of the best fitting line is  0.578
The correlation coefficient is: 0.832
 The mean square error is: 0.08346775760164514
No description has been provided for this image
No description has been provided for this image
 80%|████████  | 8/10 [09:25<02:21, 70.51s/it]
The amount of data points is 1863
The slope of the best fitting line is  0.95
The correlation coefficient is: 0.818
 The mean square error is: 0.051104704879814236
No description has been provided for this image
No description has been provided for this image
 90%|█████████ | 9/10 [10:36<01:10, 70.54s/it]
The amount of data points is 1863
The slope of the best fitting line is  0.673
The correlation coefficient is: 0.792
 The mean square error is: 0.07310904781090435
No description has been provided for this image
No description has been provided for this image
100%|██████████| 10/10 [11:52<00:00, 71.24s/it]
100%|██████████| 10/10 [11:52<00:00, 71.24s/it]
In [ ]: